Setup

# Warnings and startup messages suppressed
library(tidyverse)
library(patchwork)
library(scales)
library(ggrepel)
library(readxl)
library(here)
# Import data
SpeciesDetections <- read_csv(here("OCNMS_Project", "Outputs", "SOI_IDs_Species10Detections.csv")) %>% 
  filter(year(Date_UTC) != 2023)
## Rows: 532 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (19): Family, Genus, Species, JV_Sample_Name, Barcode.x, Barcode_mod, S...
## dbl   (8): Biological_Replicate, Technical_Replicate, Depth_m, Lat_dec, Lon_...
## lgl   (1): Present
## dttm  (2): Date_UTC, Date_local
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
EnvData1 <- read_csv(here("OCNMS_Project", "Data", "EnvironmentalDataset1.csv")) %>%  # Using EnvironmentalDataset1 because the satellite + NEMO data in EnvironmentalDataset2 didn't turn out to be good at gap filling
  filter(year != 2023) %>%  # Ignoring 2023 due to gaps for now
  mutate(year = as.factor(year))
## New names:
## Rows: 38938 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): source dbl (8): ...1, year, temperature, DO, salinity, potential_density,
## pres, cond dttm (2): date, sampleID
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
SamplingDates2 <- read_csv(here("OCNMS_Project", "Data", "SamplingDates2.csv")) %>%  # Exported from EnvironmentalDataxSampleDates.Rmd, simple dataframe of all datetimes of samples
  filter(year(Date) != 2023) %>% # Ignoring 2023 due to gaps for now
  mutate(Source = case_when(Source == "Bottle_DNA" ~ "Bottle_DNA_Sampled", Source == "PPS_DNA" ~ "Automated_DNA_Sampler"))
## New names:
## Rows: 109 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): Source dbl (1): ...1 dttm (1): Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Date check

# Let's take a quick look at whether the dates match up
sampdates <- data.frame(Date = SamplingDates2$Date, Source = "Metadata")
detectdates <- data.frame(Date = SpeciesDetections$Date_UTC, Source = "Detections")
datescomp <- rbind(sampdates,detectdates)

ggplot() +
  geom_vline(data = datescomp, mapping = aes(xintercept = Date, color = Source, linetype = Source)) +
  scale_linetype_manual(values = c(2,1)) +
  theme_bw() +
  facet_wrap(facets = vars(year(Date)), scales = "free_x", ncol = 1) #+
  #scale_x_datetime(date_breaks = "4 days", date_labels = "%y-%b-%d") # This is fucked but tbh I don't need it

Sampling x environmental data graph

ggplot(EnvData1, aes(x = date, y = DO, color = source, alpha = source)) +
  geom_point() +
  scale_alpha_manual(values = c(1,0.1)) + # doesn't currently do anything
  scale_x_datetime(date_breaks = "1 month", date_labels = "%m-%y") +
  theme_bw() +
  facet_wrap(facets = vars(year), scales = "free_x", ncol = 1) +
  geom_vline(data = SamplingDates2, aes(xintercept = Date, color = Source)) +
  scale_color_manual(values = c("purple1", "black", "firebrick2", "dodgerblue2")) +
  geom_hline(aes(yintercept = 2), color = "black", linetype = "dashed") +
  labs(title = "Dissolved Oxygen Data + Sampling Dates", caption = "Dotted line = hypoxic threshold of 2 mg/L dissolved oxygen, vertical lines = DNA sampling times", alpha = "Environmental Data", color = "Source")
## Warning: Removed 236 rows containing missing values or values outside the scale range
## (`geom_point()`).

Data Join

messyjoin <- full_join(EnvData1, SpeciesDetections, by = join_by(date == Date_UTC)) # did not work, unsurprised
EnvRd <- EnvData1 %>% 
  mutate(DateMatch = round_date(date, unit = "10 minutes")) # well at least it didn't immediately explode. However, this still has many datapoints per hour. Maybe round to the nearest 10 minutes? 
DetectRd <- SpeciesDetections %>% 
  mutate(DateMatch = round_date(Date_UTC, unit = "10 minutes"), Date_local_hr = round_date(Date_local, unit = "hour")) # Spot check - looks good. 

messyrdjoin <- full_join(EnvRd, DetectRd, by = join_by(DateMatch)) # Many to many is probably expected - There's several detections per time point

# Ok, that was pretty good! To try and avoid many-to-many, perhaps a specific join. Don't care about EnvHr without DetectHr matches, so make DetectHr x and do a left join (A left_join() keeps all observations in x.)

eDNAxEnvData <- left_join(DetectRd, EnvRd, by = join_by(DateMatch)) 

investigate <- eDNAxEnvData %>% select(Species, DateMatch, Date_UTC, Date_local_hr, source, temperature, DO, SampleId, Rosette_position, Amplicon)

Joined data graphs

ggplot(eDNAxEnvData, aes(x = DateMatch, y = DO, shape = Present, size = Present, color = Present)) +
  scale_shape_manual(values = c(1, 19)) +
  scale_size_manual(values = c(1,1)) +
  geom_point() +
  theme_bw() +
  facet_wrap(facets = vars(Species))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(data = EnvData1, aes(x = date, y = DO)) +
  geom_line(color = "gray90") +
  geom_point(data = eDNAxEnvData, aes(x = DateMatch, y = DO, shape = Present, size = Present, color = Present)) +
  scale_shape_manual(values = c(1, 19)) +
  scale_size_manual(values = c(1,1)) + 
  theme_bw() +
  facet_wrap(facets = vars(Species, year(date)), scales = "free_x", ncol = 2)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

EnvDataRange <- EnvData1 %>% 
  filter(date > as.POSIXct("2021-08-15 00:00:00", tz = "UTC"))

ggplot(data = EnvDataRange, aes(x = date, y = DO)) +
  geom_line(color = "gray90") +
  geom_point(data = eDNAxEnvData, aes(x = DateMatch, y = DO, shape = Present, size = Present, color = Present)) +
  scale_shape_manual(values = c(1, 19)) +
  scale_size_manual(values = c(1,1)) + 
  theme_bw() +
  facet_wrap(facets = vars(Species, year(date)), scales = "free_x", ncol = 4) +
  scale_x_datetime(breaks = "month", date_labels = "%b-%y") +
  theme(text = element_text(size = 20), axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(size = 8), strip.background = element_rect(fill = "gray95")) +
  geom_hline(yintercept = 2, linetype = 2) +
  labs(title = "Species Presence and Absence Over Dissolved Oxygen", x = "Date", y = "Dissolved Oxygen (mg/L) At TH042")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(filename = here("OCNMS_Project", "Plots", "SpeciesPresence_Oxygen_Preliminary.png"), width = 2500, height = 2000, units = "px")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(data = EnvDataRange, aes(x = date, y = temperature)) +
  geom_line(color = "gray90") +
  geom_point(data = eDNAxEnvData, aes(x = DateMatch, y = temperature, shape = Present, size = Present, color = Present)) +
  scale_shape_manual(values = c(1, 19)) +
  scale_size_manual(values = c(1,1)) + 
  theme_bw() +
  facet_wrap(facets = vars(Species, year(date)), scales = "free_x", ncol = 4) +
  scale_x_datetime(breaks = "month", date_labels = "%b-%y") +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(size = 8), strip.background = element_rect(fill = "gray95")) +
  labs(title = "Species Presence and Absence Over Temperature", x = "Date", y = "Temperature (C) At TH042")

ggsave(filename = here("OCNMS_Project", "Plots", "SpeciesPresence_Temp_Preliminary.png"), width = 2500, height = 2000, units = "px")

DO x Temp graphs

ggplot(EnvDataRange, aes(x = temperature, y = DO, shape = source, color = as.factor(year))) +
  scale_shape_manual(values = c(4,19)) +
  scale_color_manual(values = c("blue", "black")) +
  geom_point(shape = 1, alpha = 0.5) +
  theme_bw() +
  labs(title = "DO vs Temp during sampling years")
## Warning: Removed 231 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(here("OCNMS_eDNA", "Plots", "DO_vs_Temp.png"))
## Saving 7 x 5 in image
## Warning: Removed 231 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(EnvDataRange, aes(x = temperature, y = DO, shape = source, color = as.factor(year))) +
  scale_shape_manual(values = c(4,19)) +
  scale_color_manual(values = c("blue", "black")) +
  geom_point(shape = 1, alpha = 0.5) +
  theme_bw() +
  facet_wrap(facets = vars(year)) +
  labs(title = "DO vs Temp during sampling years")
## Warning: Removed 231 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(here("OCNMS_eDNA", "Plots", "DO_vs_TempYear.png"))
## Saving 7 x 5 in image
## Warning: Removed 231 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(EnvDataRange, aes(x = temperature, y = DO, shape = source, color = as.factor(year))) +
  scale_shape_manual(values = c(4,19)) +
  scale_color_manual(values = c("blue", "black")) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_x_continuous(limits = c(7, 10)) +
  scale_y_continuous(limits = c(0,6)) +
  theme_bw() +
  labs(title = "DO vs Temp during sampling years (Zoomed in)")
## Warning: Removed 1044 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(here("OCNMS_eDNA", "Plots", "DO_vs_Temp_Zoomed.png"))
## Saving 7 x 5 in image
## Warning: Removed 1044 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(EnvDataRange, aes(x = temperature, y = DO, shape = source, color = as.factor(year))) +
  scale_shape_manual(values = c(4,19)) +
  scale_color_manual(values = c("blue", "black")) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_x_continuous(limits = c(7, 10)) +
  scale_y_continuous(limits = c(0,6)) +
  theme_bw() +
  facet_wrap(facets = vars(year)) +
  labs(title = "DO vs Temp during sampling years (Zoomed in)")
## Warning: Removed 1044 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(here("OCNMS_eDNA", "Plots", "DO_vs_Temp_ZoomedYear.png"))
## Saving 7 x 5 in image
## Warning: Removed 1044 rows containing missing values or values outside the scale range
## (`geom_point()`).

Binomial Regression

joinSpeciesList <- split(eDNAxEnvData, eDNAxEnvData$Species)

oxmodels <- lapply(joinSpeciesList, glm, formula = Present ~ DO, family = "binomial")
lapply(oxmodels, summary)
## $`Acartia longiremis`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   2.0986     0.8390   2.501   0.0124 *
## DO            0.2513     0.3884   0.647   0.5176  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 51.015  on 101  degrees of freedom
## Residual deviance: 50.572  on 100  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 54.572
## 
## Number of Fisher Scoring iterations: 5
## 
## 
## $`Calanus pacificus`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.07917    0.68168   0.116  0.90754   
## DO          -1.40949    0.46551  -3.028  0.00246 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 65.434  on 101  degrees of freedom
## Residual deviance: 52.869  on 100  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 56.869
## 
## Number of Fisher Scoring iterations: 6
## 
## 
## $`Clupea pallasii`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  1.02402    0.48112   2.128   0.0333 *
## DO          -0.04661    0.19676  -0.237   0.8127  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 121.78  on 101  degrees of freedom
## Residual deviance: 121.73  on 100  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 125.73
## 
## Number of Fisher Scoring iterations: 4
## 
## 
## $`Oncorhynchus tshawytscha`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.3263     0.8270  -4.022 5.77e-05 ***
## DO            0.3668     0.2896   1.267    0.205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56.084  on 101  degrees of freedom
## Residual deviance: 54.591  on 100  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 58.591
## 
## Number of Fisher Scoring iterations: 5
tmodels <- lapply(joinSpeciesList, glm, formula = Present ~ temperature, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lapply(tmodels, summary)
## $`Acartia longiremis`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -59.444     29.709  -2.001   0.0454 *
## temperature    8.103      3.906   2.074   0.0381 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 51.157  on 102  degrees of freedom
## Residual deviance: 44.081  on 101  degrees of freedom
## AIC: 48.081
## 
## Number of Fisher Scoring iterations: 8
## 
## 
## $`Calanus pacificus`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -4.7330     3.5421  -1.336    0.181
## temperature   0.3182     0.4458   0.714    0.475
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 65.639  on 102  degrees of freedom
## Residual deviance: 65.215  on 101  degrees of freedom
## AIC: 69.215
## 
## Number of Fisher Scoring iterations: 5
## 
## 
## $`Clupea pallasii`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   3.1914     3.0248   1.055    0.291
## temperature  -0.2938     0.3848  -0.764    0.445
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 124.28  on 102  degrees of freedom
## Residual deviance: 123.69  on 101  degrees of freedom
## AIC: 127.69
## 
## Number of Fisher Scoring iterations: 4
## 
## 
## $`Oncorhynchus tshawytscha`
## 
## Call:
## FUN(formula = ..1, family = "binomial", data = X[[i]])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.5453     7.4505  -0.073    0.942
## temperature  -0.2474     0.9566  -0.259    0.796
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56.246  on 102  degrees of freedom
## Residual deviance: 56.163  on 101  degrees of freedom
## AIC: 60.163
## 
## Number of Fisher Scoring iterations: 5

Check for outliers

ggplot(EnvDataRange, aes(x = DO, y = temperature)) +
  geom_point(color = "gray90", alpha = 0.5) +
  geom_point(eDNAxEnvData, mapping = aes(x = DO, y = temperature, color = Present, shape = Present), inherit.aes = F) +
    scale_shape_manual(values = c(1, 19)) +
  theme_bw() +
  theme(text = element_text(size = 15), strip.text = element_text(size = 8), strip.background = element_rect(fill = "gray95")) +
  labs(title = "Dissolved Oxygen vs. Temp + Species Presence", y = "Temperature") +
  facet_wrap(facets = vars(Species))
## Warning: Removed 924 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(filename = here("OCNMS_Project", "Plots", "SpeciesPresence_TempxDO.png"), width = 2500, height = 2000, units = "px")
## Warning: Removed 924 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

Check for linearity

“Assumption #5 involves the necessity of a linear relationship between the continuous independent variables and the logit transformation of the dependent variable. This linearity assumption implies that for continuous independent variables like income level, hours of exercise per week, and blood sugar levels, there should be a linear relationship with the logit of the dependent variable, such as the probability of developing diabetes. Various methods can be employed to assess this linearity, with one common approach being the Box-Tidwell procedure. This technique involves creating interaction terms between each continuous independent variable and its natural logarithm and adding these to the logistic regression model. This technique can be implemented using software like SPSS Statistics, which offers the Binary Logistic procedure to test for this assumption. The results of this test are then interpreted to decide the next steps in the analysis, depending on whether the linearity assumption holds or is violated. If the assumption is met, the analysis can proceed as planned. However, if the assumption is not met, adjustments to the model or alternative methods may be necessary to address the non-linearity appropriately.” - Binomial Logistic Regression

Basically, this equation needs to have a somewhat linear relationship with the independent variable:

\[g(p) = log\left( \frac{p}{1-p} \right)\] Where p = probability of 1 (if 1, p. if 0, 1-p)?

library(car) # Has a function for the Box-Tidwell procedure
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
?boxTidwell

joinSpeciesList <- lapply(joinSpeciesList, drop_na, DO)

# joinSpeciesList, glm, formula = Present ~ DO

# Single test
boxTidwell(Present ~ DO, other.x = ~ year, data = joinSpeciesList[[1]]) # other.x = any factors not to be transformed. i had to make year into a factor to make it accept this lmao
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##         229.08              0.3369   0.7369
## 
## iterations =  26
# Loop to test all for linearity
for (i in 1:length(joinSpeciesList)) {
  print(paste(names(joinSpeciesList)[i], sep = " ", "Presence vs DO"))
  print(boxTidwell(Present ~ DO, other.x = ~ year, data = joinSpeciesList[[i]]))
}
## [1] "Acartia longiremis Presence vs DO"
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##         229.08              0.3369   0.7369
## 
## iterations =  26 
## [1] "Calanus pacificus Presence vs DO"
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##  MLE of lambda Score Statistic (t) Pr(>|t|)   
##        -7.4201              2.8025 0.006112 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## iterations =  26 
## [1] "Clupea pallasii Presence vs DO"
##  MLE of lambda Score Statistic (t) Pr(>|t|)   
##        -29.763             -2.8066 0.006041 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## iterations =  11 
## [1] "Oncorhynchus tshawytscha Presence vs DO"
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##       0.043619             -0.7706   0.4428
## 
## iterations =  15
for (i in 1:length(joinSpeciesList)) {
  print(paste(names(joinSpeciesList)[i], sep = " ", "Presence vs Temp"))
  print(boxTidwell(Present ~ temperature, other.x = ~ year, data = joinSpeciesList[[i]]))
}
## [1] "Acartia longiremis Presence vs Temp"
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##        -13.576             -1.2963   0.1979
## 
## iterations =  26 
## [1] "Calanus pacificus Presence vs Temp"
##  MLE of lambda Score Statistic (t) Pr(>|t|)  
##        -11.882              -2.008  0.04739 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## iterations =  8 
## [1] "Clupea pallasii Presence vs Temp"
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##         5.2474             -1.1528   0.2518
## 
## iterations =  26 
## [1] "Oncorhynchus tshawytscha Presence vs Temp"
##  MLE of lambda Score Statistic (t) Pr(>|t|)
##         1.6114             -0.0447   0.9644
## 
## iterations =  15